#Nature's Kidneys: the Role of Wetland Reserve Easements in Restoring Water Quality
#Nicole Karwowski and Marin Skidmore
#Replication File
#2025

#Process the TRI Point Source Pollution Data

# Load packages
library(dplyr)
library(tidyverse)
library(sf)
library(tmap)
library(raster)
library(readr)
library(dplyr)

#set wd
setwd("C:/Users/n57m645/OneDrive - Montana State University/UW_Madison/Water quality/Replication Package")

#-----------------------APPEND TRI DATA TO ONE DATA SET-------------------------#

# Create a for-loop to load TRI data 
years = 1987:2023
data_list <- list("Data/TRI/Raw")

for (year in years) {
  # Construct the filename with correct relative path
  filename <- file.path("Data/TRI/Raw", paste0(year, "_us.csv"))
  
  if (file.exists(filename)) {
    # Read the CSV, treating the first line as the header 
    data <- read.csv(filename, header = TRUE)
    # Store the data frame in the list
    data_list[[as.character(year)]] <- data
  } else {
    cat("File not found:", filename, "\n")
  }
}

# Combine all data frames into one
if (length(data_list) > 0) {
  combined_data <- do.call(rbind, data_list)
}

tri_data = combined_data


#-------------------REDUCE DATA SET TO NECESSARY INFORMATION--------------------#

# Rename variables (they were named weirdly in Excel so want to fix that) and replace spaces with _
names(tri_data) <- sub("^X\\d+\\.\\.", "", names(tri_data))  # Remove the prefix like "X1.." or "X14.."
names(tri_data) <- gsub("\\.", "_", names(tri_data))  # Replace periods with underscores

# View the variable names in tri_data
colnames(tri_data)

# Rename some variables in the data set
tri_rename = tri_data |> 
  rename(FACILITY_ID = TRIFD,
         CHEMICAL_NAME = CHEMICAL,
         CHEMICAL_ID = TRI_CHEMICAL_COMPOUND_ID,
         WATER_MEASURE = `5_3___WATER`)

# Select only the variables of interest
tri_drop_vars = tri_rename |> 
  dplyr::select(YEAR, 
         FACILITY_ID, 
         FACILITY_NAME, 
         LATITUDE, 
         LONGITUDE, 
         CHEMICAL_NAME, 
         CHEMICAL_ID, 
         UNIT_OF_MEASURE, 
         WATER_MEASURE,
         ON_SITE_RELEASE_TOTAL)

# Make sure that for each chemical name, the chemical id is the same as what we know 
unique_ammonia <- tri_drop_vars |>
  filter(CHEMICAL_NAME == "Ammonia") |>
  distinct(CHEMICAL_ID)

unique_phosphorus <- tri_drop_vars |>
  filter(grepl("Phosphorus", CHEMICAL_NAME, ignore.case = TRUE)) |>
  distinct(CHEMICAL_ID)

unique_nitrate <- tri_drop_vars |>
  filter(grepl("Nitrate compound", CHEMICAL_NAME, ignore.case = TRUE)) |>
  distinct(CHEMICAL_ID)

# All have the chemical ID that we expected and no other IDs are associated with each chemical

# Filter the data to only the chemicals of interest 
tri_chemicals = tri_drop_vars |> 
  filter(CHEMICAL_ID == "0012185103" | CHEMICAL_ID == "N511" | CHEMICAL_ID == "0007664417")


#-----------------------------PRELIMINARY CLEANING------------------------------#

# Check to see if the panel is facility-year level
tri_check = tri_chemicals |>
  group_by(YEAR, FACILITY_ID) |>
  summarise(unique_id = n_distinct(FACILITY_ID)) |>
  ungroup()

unique(tri_check$unique_id)
# 1 for all unique_id 

# Look at data structure 
str(tri_chemicals)

# Check for duplicates
duplicates = tri_chemicals |> 
  count(across(everything())) |> 
  filter(n > 1)

# Calculate number of observations that should be dropped
observations_to_drop <- sum((duplicates$n) - 1)

# There are duplicates so keep only unique observations 
tri_panel_unique = tri_chemicals |> 
  distinct()

# Calculate difference in observations between tri_chemicals and tri_panel_unique (should be equal
# to observations_to_drop (243))
n_original = nrow(tri_chemicals)
n_unique = nrow(tri_panel_unique)

difference = n_original - n_unique
# The difference is 243, so the distinct() command worked how it should to remove duplicates

# Check the unit of measurement for the data and convert if necessary
unique(tri_panel_unique$UNIT_OF_MEASURE) 

# All in pounds, so no need to convert (unless do not want in pounds)

# Check for NA values in the data
sum(is.na(tri_panel_unique))

# No NA values

# Make sure that there is never a numeric value for WATER_MEASURE when ON_SITE_RELEASE_TOTAL is 0
# This may be able to answer whether 0's are valid and not actually NA values.
violations = tri_panel_unique |>
  filter(ON_SITE_RELEASE_TOTAL == 0 & !is.na(WATER_MEASURE) & WATER_MEASURE != 0)

# 0 violations 

# Also check that on site release total is always greater than or equal to the water release
check = all(tri_panel_unique$ON_SITE_RELEASE_TOTAL >= tri_panel_unique$WATER_MEASURE)

# True


#-----------------------------CONVERT TO WIDE FORMAT----------------------------#

# NOTE: All measurements are in pounds, and the chemical id's for each chemical name are listed in 
# unique_ammonia, unique_nitrate, and unique_phosphorus (refer to lines 65 - 75)

# NOTE: In cases where there are multiple observations with the same FACILITY_ID, YEAR, CHEMICAL 
# combination, but the output (total and water) is different, these are just because of industry 
# differences. We don't care about industry differences, so we are summing these up. 

# Convert tri_panel_unique to wide format and select only columns of interest
tri_panel_wide <- tri_panel_unique %>%
  mutate(
    CHEMICAL_TYPE = case_when(
      CHEMICAL_ID == "0007664417" ~ "AMMONIA",
      CHEMICAL_ID == "0012185103" ~ "PHOSPHORUS",
      CHEMICAL_ID == "N511" ~ "NITRATE",
      TRUE ~ NA_character_
    ),
    ON_SITE_RELEASE_TOTAL = as.numeric(ON_SITE_RELEASE_TOTAL),
    WATER_MEASURE = as.numeric(WATER_MEASURE)
  ) %>%
  dplyr::select(
    YEAR, FACILITY_ID, FACILITY_NAME, LATITUDE, LONGITUDE,
    CHEMICAL_TYPE, ON_SITE_RELEASE_TOTAL, WATER_MEASURE
  ) %>%
  pivot_wider(
    names_from = CHEMICAL_TYPE,
    values_from = c(ON_SITE_RELEASE_TOTAL, WATER_MEASURE),
    values_fn = sum,
    names_glue = "{CHEMICAL_TYPE}_{.value}"
  ) %>%
  rename(
    AMMONIA_TOTAL = AMMONIA_ON_SITE_RELEASE_TOTAL,
    AMMONIA_WATER = AMMONIA_WATER_MEASURE,
    PHOSPHORUS_TOTAL = PHOSPHORUS_ON_SITE_RELEASE_TOTAL,
    PHOSPHORUS_WATER = PHOSPHORUS_WATER_MEASURE,
    NITRATE_TOTAL = NITRATE_ON_SITE_RELEASE_TOTAL,
    NITRATE_WATER = NITRATE_WATER_MEASURE
  )

# Check that all observations for ammonia are in the tri_panel_wide data set
ammonia = tri_panel_unique |> 
  filter(CHEMICAL_ID == "0007664417") |>
  dplyr::select(YEAR, FACILITY_ID, FACILITY_NAME, LATITUDE, LONGITUDE, ON_SITE_RELEASE_TOTAL, WATER_MEASURE) |>
  group_by(YEAR, FACILITY_ID, FACILITY_NAME, LATITUDE, LONGITUDE) |>
  summarize(
    AMMONIA_TOTAL = sum(as.numeric(ON_SITE_RELEASE_TOTAL)),
    AMMONIA_WATER = sum(as.numeric(WATER_MEASURE))) |>
  ungroup()

ammonia_missing = tri_panel_wide |> 
  dplyr::select(-PHOSPHORUS_TOTAL, -PHOSPHORUS_WATER, -NITRATE_TOTAL, -NITRATE_WATER) |> 
  filter(!is.na(AMMONIA_TOTAL), !is.na(AMMONIA_WATER)) |>
  setdiff(ammonia) 

# 0 observations, so all observations in ammonia are in tri_panel_wide

# Check that all observations for phosphorus are in the tri_panel_wide data set
phosphorus = tri_panel_unique |> 
  filter(CHEMICAL_ID == "0012185103") |>
  dplyr::select(YEAR, FACILITY_ID, FACILITY_NAME, LATITUDE, LONGITUDE, ON_SITE_RELEASE_TOTAL, WATER_MEASURE) |>
  group_by(YEAR, FACILITY_ID, FACILITY_NAME, LATITUDE, LONGITUDE) |>
  summarize(
    PHOSPHORUS_TOTAL = sum(as.numeric(ON_SITE_RELEASE_TOTAL)),
    PHOSPHORUS_WATER = sum(as.numeric(WATER_MEASURE))) |>
  ungroup()

phosphorus_missing = tri_panel_wide |> 
  dplyr::select(-AMMONIA_TOTAL, -AMMONIA_WATER, -NITRATE_TOTAL, -NITRATE_WATER) |> 
  filter(!is.na(PHOSPHORUS_TOTAL), !is.na(PHOSPHORUS_WATER)) |>
  setdiff(phosphorus) 

# 0 observations, so all observations in phosphorus are in tri_panel_wide

# Check that all observations for nitrate are in the tri_panel_wide data set
nitrate = tri_panel_unique |> 
  filter(CHEMICAL_ID == "N511") |>
  dplyr::select(YEAR, FACILITY_ID, FACILITY_NAME, LATITUDE, LONGITUDE, ON_SITE_RELEASE_TOTAL, WATER_MEASURE) |>
  group_by(YEAR, FACILITY_ID, FACILITY_NAME, LATITUDE, LONGITUDE) |>
  summarize(
    NITRATE_TOTAL = sum(as.numeric(ON_SITE_RELEASE_TOTAL)),
    NITRATE_WATER = sum(as.numeric(WATER_MEASURE))) |>
  ungroup()

nitrate_missing = tri_panel_wide |> 
  dplyr::select(-AMMONIA_TOTAL, -AMMONIA_WATER, -PHOSPHORUS_TOTAL, -PHOSPHORUS_WATER) |> 
  filter(!is.na(NITRATE_TOTAL), !is.na(NITRATE_WATER)) |>
  setdiff(nitrate) 

# 0 observations, so all observations in nitrate are in tri_panel_wide

# Save data 
save(tri_panel_wide, file = "Data/TRI/Processed/tri_panel_wide.Rda")


#---------------------------MERGE WITH MARB SHAPEFILE---------------------------#

# Convert tri_panel_wide data to geo-spatial using FACILITY_NAME, FACILITY_ID, LATITUDE, LONGITUDE
tri_spatial = st_as_sf(
  tri_panel_wide,
  coords = c("LONGITUDE", "LATITUDE"), 
  crs = 4326
)

# Map the data
tmap_mode("view")
qtm(tri_spatial)

# Looks good. Note that there are observations for Guam and Samoa.  

# Load in the MARB spatial data
huc12_usa = st_read("Data/WatershedBoundaries/Raw/USGS_WBD/USGS_WBD.shp") |> 
  st_make_valid()

# Map the data
tmap_mode("view")
qtm(huc12_usa)

# Keep only columns of interest 
huc12_usa = huc12_usa[, c("huc12", "geometry")]

# Check crs 
crs(huc12_usa)
crs(tri_spatial)

# Merge the data using st_join
spatial_join = st_join(huc12_usa, tri_spatial, 
                       join = st_intersects)

# Rename Huc12 as HUC12 for consistency in variable names 
tri_huc12_usa = spatial_join |>
  rename(HUC12 = huc12)

# Map the data
tmap_mode("view")
tm_shape(tri_huc12_usa) +
  tm_polygons(col = "grey", alpha = 0.5) +
  tm_shape(tri_spatial) +
  tm_dots(col = "red")

# Convert tri_panel_spatial to a data frame 
tri_huc12_usa = subset(as.data.frame(tri_huc12_usa), select = c(FACILITY_ID, YEAR, HUC12))

# Check structure of tri_panel_wide and tri_panel_spatial 
str(tri_panel_spatial)
str(tri_panel_wide)

# Merge tri_panel_spatial and tri_panel_wide 
merge_tri = merge(tri_panel_wide, tri_huc12_usa, by = c("FACILITY_ID", "YEAR"), all.x = TRUE, all.y = FALSE)

# Aggregate chemical compounds (total and water) for each HUC12 in each year
tri_huc12_panel = merge_tri |>
  group_by(HUC12, YEAR) |>
  summarise(
    AMMONIA_TOTAL = sum(AMMONIA_TOTAL, na.rm = TRUE),
    AMMONIA_WATER = sum(AMMONIA_WATER, na.rm = TRUE),
    NITRATE_TOTAL = sum(NITRATE_TOTAL, na.rm = TRUE),
    NITRATE_WATER = sum(NITRATE_WATER, na.rm = TRUE),
    PHOSPHORUS_TOTAL = sum(PHOSPHORUS_TOTAL, na.rm = TRUE),
    PHOSPHORUS_WATER = sum(PHOSPHORUS_WATER, na.rm = TRUE)
  ) |> 
  ungroup()

summary(tri_huc12_panel)
table(tri_huc12_panel$YEAR)

# Save data 
save(tri_huc12_panel, file = "Data/TRI/Processed/tri_huc12_panel.Rda")
